version 18.0               // version control
set processors 8           // to ensure replicability across different numbers of cores
clear all                  // clear existing data
macro drop _all            // and macros, clean slate
set seed 20220909          // set seed
set scheme cleanplots      // clean plots
/* 
Notes: 
1) Must put "set sortseed #" right on top of EACH "telasso" (rather than at the top of the do file) to ensure the same number of selected controls will be reported.
2) For detailed explanations, see Stata's technical support's email chain on 10/10/2023. 
*/

local pgm  "dst-Figure_8a_depression_telasso_donut_hole"              // file name
local who  "Muzhe Yang"                                               // author
local dte  "2025-01-21"                                               // created date
local dte2 "`c(current_date)'"                                        // last run date
local tag  "`pgm'.do, created by `who' on `dte', last run on `dte2'"

capture log close
log using "code\analysis\figures\\`pgm'.txt", text replace 
display "`tag'"

**# data prep ------------------------------------------------------------------------------------

use "data_clean\dst-data04_for_estimation_within_250_miles", clear
keep if wave == 2

*## (1) create the 9 cells, following page 215 of the following paper:
*## Giuntella, O. and F. Mazzonna (2019). "Sunset Time and the Economic Effects of Social Jetlag: Evidence from US Time Zone Borders." Journal of Health Economics 65: 210-226.

assert !missing(centroid_lat)
assert !missing(region)
gen     cell = 1 if centroid_lat > 40                        & region == "eastern and central"
replace cell = 2 if (34 < centroid_lat & centroid_lat <= 40) & region == "eastern and central"
replace cell = 3 if centroid_lat <= 34                       & region == "eastern and central"
replace cell = 4 if centroid_lat > 40                        & region == "central and mountain"
replace cell = 5 if (34 < centroid_lat & centroid_lat <= 40) & region == "central and mountain"
replace cell = 6 if centroid_lat <= 34                       & region == "central and mountain"
replace cell = 7 if centroid_lat > 40                        & region == "mountain and pacific"
replace cell = 8 if (34 < centroid_lat & centroid_lat <= 40) & region == "mountain and pacific"
replace cell = 9 if centroid_lat <= 34                       & region == "mountain and pacific"

*## (2) control variables

global x "pop white_pct black_pct hispanic_pct pop_under_18yr_pct pop_65yr_over_pct age_median educ_hs_pct educ_coll_pct married_pct hh_size hh_income_median home_value_median no_health_ins_pct unemploy_pct"
global cvars    c.($x dist_to_border centroid_lat daylight_dur_max)##c.($x dist_to_border centroid_lat daylight_dur_max)
global controls $cvars i.cell i.cell#c.($x dist_to_border centroid_lat daylight_dur_max) 

**# estimation prep -------------------------------------------------

encode CountyFIPS, gen(county_fips)
egen nomiss = rowmiss($x dist_to_border centroid_lat daylight_dur_max)
assert !missing(StateAbbr)
local radius = 50
gen sample_selection = (nomiss == 0 & StateAbbr != "AZ" & dist_to_border <= `radius')
global xfolds_resample "xfolds(10) resample(3) nolog"

**# figure ----------------------------------------------------------------------------------------------

set sortseed 12102023
telasso (depression $controls) (treat $controls) if sample_selection == 1, vce(cluster county_fips) $xfolds_resample rseed(10101)
estimates store all_0_mile_hole
summ dist_to_border if e(sample)

set sortseed 12102023
telasso (depression $controls) (treat $controls) if (sample_selection == 1 & (dist_to_border > 5 & !missing(dist_to_border))), vce(cluster county_fips) $xfolds_resample rseed(10101)
estimates store all_5_mile_hole
summ dist_to_border if e(sample)

set sortseed 12102023
telasso (depression $controls) (treat $controls) if (sample_selection == 1 & (dist_to_border > 10 & !missing(dist_to_border))), vce(cluster county_fips) $xfolds_resample rseed(10101)
estimates store all_10_mile_hole
summ dist_to_border if e(sample)

set sortseed 12102023
telasso (depression $controls) (treat $controls) if (sample_selection == 1 & (dist_to_border > 15 & !missing(dist_to_border))), vce(cluster county_fips) $xfolds_resample rseed(10101)
estimates store all_15_mile_hole
summ dist_to_border if e(sample)

set sortseed 12102023
telasso (depression $controls) (treat $controls) if (sample_selection == 1 & (dist_to_border > 20 & !missing(dist_to_border))), vce(cluster county_fips) $xfolds_resample rseed(10101)
estimates store all_20_mile_hole
summ dist_to_border if e(sample)

set sortseed 12102023
telasso (depression $controls) (treat $controls) if (sample_selection == 1 & (dist_to_border > 25 & !missing(dist_to_border))), vce(cluster county_fips) $xfolds_resample rseed(10103)
estimates store all_25_mile_hole
summ dist_to_border if e(sample)
/*
When using "rseed(10101)", we got the following error message:
"treatment 0 has 1 propensity scores less than 1.00e-05. Treatment overlap assumption has been violated; use option osample() to identify the overlap violators."
When using "rseed(10102)", we got the following error message:
"note: convergence for the lasso penalty = 0.269375 not reached after 100000 iterations; solutions for larger penalty values returned. no variable is selected"
As a result, we used "rseed(10103)" and then did't get any error message. 
*/

set sortseed 12102023
telasso (depression $controls) (treat $controls) if (sample_selection == 1 & (dist_to_border > 30 & !missing(dist_to_border))), vce(cluster county_fips) $xfolds_resample rseed(10101)
estimates store all_30_mile_hole
summ dist_to_border if e(sample)

set sortseed 12102023
telasso (depression $controls) (treat $controls) if (sample_selection == 1 & (dist_to_border > 35 & !missing(dist_to_border))), vce(cluster county_fips) $xfolds_resample rseed(10102)
estimates store all_35_mile_hole
summ dist_to_border if e(sample)
/*
When using "rseed(10101)", we got the following error message:
"note: convergence for the lasso penalty = 0.402120 not reached after 100000 iterations; solutions for larger penalty values returned."
As a result, we used "rseed(10102)" and then did't get any error message. 
*/

set sortseed 12102023
telasso (depression $controls) (treat $controls) if (sample_selection == 1 & (dist_to_border > 40 & !missing(dist_to_border))), vce(cluster county_fips) $xfolds_resample rseed(10101)
estimates store all_40_mile_hole
summ dist_to_border if e(sample)

set sortseed 12102023
telasso (depression $controls) (treat $controls) if (sample_selection == 1 & (dist_to_border > 45 & !missing(dist_to_border))), vce(cluster county_fips) $xfolds_resample rseed(10108)
estimates store all_45_mile_hole
summ dist_to_border if e(sample)
/*
When using "rseed(10101)", we got the following error message:
"note: convergence for the lasso penalty = 0.562097 not reached after 100000 iterations; solutions for larger penalty values returned. no variable is selected"
When using "rseed(10102)", we got the following error message:
"note: convergence for the lasso penalty = 0.553119 not reached after 100000 iterations; solutions for larger penalty values returned. no variable is selected"
When using "rseed(10103)", we got the following error message:
"note: convergence for the lasso penalty = 0.548972 not reached after 100000 iterations; solutions for larger penalty values returned. no variable is selected"
When using "rseed(10104)", we got the following error message:
"note: convergence for the lasso penalty = 0.403765 not reached after 100000 iterations; solutions for larger penalty values returned. no variable is selected"
When using "rseed(10105)", we got the following error message:
"note: convergence for the lasso penalty = 0.548972 not reached after 100000 iterations; solutions for larger penalty values returned. no variable is selected"
When using "rseed(10106)", we got the following error message:
"note: convergence for the lasso penalty = 0.402197 not reached after 100000 iterations; solutions for larger penalty values returned. no variable is selected"
When using "rseed(10107)", we got the following error message:
"note: convergence for the lasso penalty = 0.548848 not reached after 100000 iterations; solutions for larger penalty values returned. no variable is selected"
As a result, we used "rseed(10108)" and then did't get any error message. 
*/

forvalues k = 0(5)45 {
    quietly estimates restore all_`k'_mile_hole
    local n_obs_`k'_mile_hole = e(N)
}
coefplot (all_0_mile_hole,  label("no hole, n = `n_obs_0_mile_hole'")) ///
         (all_5_mile_hole,  label("5-mile radius hole, n = `n_obs_5_mile_hole'")) ///  
         (all_10_mile_hole, label("10-mile radius hole, n = `n_obs_10_mile_hole'")) ///
         (all_15_mile_hole, label("15-mile radius hole, n = `n_obs_15_mile_hole'")) ///
         (all_20_mile_hole, label("20-mile radius hole, n = `n_obs_20_mile_hole'")) ///
         (all_25_mile_hole, label("25-mile radius hole, n = `n_obs_25_mile_hole'")) ///
         (all_30_mile_hole, label("30-mile radius hole, n = `n_obs_30_mile_hole'")) ///
         (all_35_mile_hole, label("35-mile radius hole, n = `n_obs_35_mile_hole'")) ///
         (all_40_mile_hole, label("40-mile radius hole, n = `n_obs_40_mile_hole'")) ///
         (all_45_mile_hole, label("45-mile radius hole, n = `n_obs_45_mile_hole'")) ///
         , keep(r1vs0.treat) ///
         coeflabels(r1vs0.treat = `""Treat (1/0):" "1 for census tracts" "located east of" "the time zone border;" "0 for census tracts" "located west of" "the time zone border""', labsize(*0.8)) ///
         mlabels mlabsize(*0.8) mlabcolor(black) mlabformat(%9.3f) mlabposition(12) mlabgap(*2) ///
         xline(0, lpattern(dash) lwidth(*2) lcolor(gray)) ///
         levels(95 90) ciopts(lwidth(*3 *3) lcolor(*.3 *.6)) ///
         xlabel(-0.5(0.5)3.5) ///
         ylabel(, noticks) ///
         legend(size(*0.8)) ///
         title("Dependent variable: depression", size(*0.8) span) ///
         subtitle("The dependent variable is defined as the prevalence of adults aged 18 years or older who report having been told by a doctor, nurse or other health professional" ///
                  "that they had depressive disorder (including depression, major depression, dysthymia, or minor depression)." ///
                  "The prevalence is at the census tract level, measured annually, and measured in 0–100 percentage points.", size(*0.6) span) ///
         note("Point estimates and the confidence intervals at the 95% level and the 90% level (in darker color) are reported in the figure.", size(*0.8)) ///
         scale(0.7)

graph save "code\analysis\figures\\`pgm'", replace
graph export "code\analysis\figures\\`pgm'.png", replace
graph export "code\analysis\figures\\`pgm'.emf", replace

log close
exit